• CONTEXT: A telecom company wants to use its historical customer data to predict behaviour and retain customers. You can analyse all relevant customer data and develop focused customer retention programs.
• DATA DESCRIPTION: Each row represents a customer, each column contains customer’s attributes described on the column Metadata. The data set includes information about:
• Customers who left within the last month – the column is called Churn
• Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
• Customer account information – how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges
• Demographic info about customers – gender, age range, and if they have partners and dependents
• PROJECT OBJECTIVE: Build a model that will help to identify the potential customers who have a higher probability to churn. This helps the company to understand the pain points and patterns of customer churn and will increase the focus on strategising customer retention.
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,LabelEncoder
from scipy import stats
%matplotlib inline
sns.set_style('darkgrid')
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn import model_selection
import warnings
warnings.filterwarnings("ignore")
import plotly
plotly.offline.init_notebook_mode()
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from statistics import mean
from tkinter import *
import tkinter as tk
from tkinter import ttk
import pickle
from sklearn.neighbors import KNeighborsClassifier
os.chdir('C:\\Users\\VGopalak\\Desktop\\ML\\Ensemble Project')
Reading Dataset
df=pd.read_csv('TelcomCustomer-Churn.csv')
df_1=df.copy()
Checking First 5 Rows
df.head()
Shape of the dataset
df.shape
Columns of the dataset
df.columns
Information about the data
df.info()
df.isnull().sum()
df.loc[df['TotalCharges']==' ']
df['TotalCharges'] = df['TotalCharges'].replace(" ", 0).astype('float32')
We can drop customer id column as it is not useful for the model building
df.drop(['customerID'],axis=1,inplace=True)
We can see all the categorical columns has datatype as object.
We need to convert the datatype to categorical
for i in df.select_dtypes(include=['object']).columns:
df[i]=df[i].astype('category') #changing the datatype of column to category
df.dtypes
5 Point Summary
df.describe()
It has only two values 0 and 1. We will change this datatype into categorical.
we have all range of tenure. Mean is greater than median so there might be little positive skewness.
Mean is less than median, there might be left skewness in the data
0 values exists in the column. 75% of values are less than 3786 but the maximum value is 8684.We need to check for the outliers. Mean is greater than median there might be right skewness in the data.
df['SeniorCitizen']=df['SeniorCitizen'].astype('category') #changing senior citizen column to category
Distribution and outlier analysis of numerical variables
for i in df.select_dtypes(include=['int64','float64','float32']).columns:
f, axes = plt.subplots(1, 2, figsize=(17,7))
sns.boxplot(x = i, data=df, orient='h' , ax=axes[1],color="Green")
#sns.set(axis_bgcolor='k')
sns.distplot(df[i], ax=axes[0],color='Green')
axes[0].set_title('Distribution plot of {}'.format(i))
axes[1].set_title('Box plot of {}'.format(i))
plt.show()
#checking count of outliers.
q25,q75=np.percentile(df[i],25),np.percentile(df[i],75)
IQR=q75-q25
Threshold=IQR*1.5
lower,upper=q25-Threshold,q75+Threshold
Outliers=[i for i in df[i] if i < lower or i > upper]
print('{} Total Number of outliers in {}: {}'.format('\033[1m',i,len(Outliers)))
Data is not normally distributed
We can see two spikes in the dataset
Most of tenure lies between 10 to 55
We have zero outliers in data
There is no normal distribution
We have no outliers in the data
There is positive skewness in the data
Distribution of categorical variables.
Gender
f,axes=plt.subplots(1,2,figsize=(17,7))
df['gender'].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot('gender',data=df,ax=axes[1],order=['Male','Female'])
axes[0].set_title('Gender Variable Pie Chart')
axes[1].set_title('Gender Variable Bar Graph')
plt.show()
Senior Citizen
f,axes=plt.subplots(1,2,figsize=(17,7))
df['SeniorCitizen'].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot('SeniorCitizen',data=df,ax=axes[1],order=[0,1])
axes[0].set_title('SeniorCitizen Variable Pie Chart')
axes[1].set_title('SeniorCitizen Variable Bar Graph')
plt.show()
Partner
f,axes=plt.subplots(1,2,figsize=(17,7))
df['Partner'].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot('Partner',data=df,ax=axes[1],order=['No','Yes'])
axes[0].set_title('Partner Variable Pie Chart')
axes[1].set_title('Partner Variable Bar Graph')
plt.show()
Dependents
f,axes=plt.subplots(1,2,figsize=(17,7))
df['Dependents'].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot('Dependents',data=df,ax=axes[1],order=['No','Yes'])
axes[0].set_title('Dependents Variable Pie Chart')
axes[1].set_title('Dependents Variable Bar Graph')
plt.show()
Phone Service
f,axes=plt.subplots(1,2,figsize=(17,7))
df['PhoneService'].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot('PhoneService',data=df,ax=axes[1],order=['Yes','No'])
axes[0].set_title('PhoneService Variable Pie Chart')
axes[1].set_title('PhoneService Variable Bar Graph')
plt.show()
Multiple Lines
f,axes=plt.subplots(1,2,figsize=(17,7))
df['MultipleLines'].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot('MultipleLines',data=df,ax=axes[1],order=['No','Yes','No phone service'])
axes[0].set_title('MultipleLines Variable Pie Chart')
axes[1].set_title('MultipleLines Variable Bar Graph')
plt.show()
Internet Service
f,axes=plt.subplots(1,2,figsize=(17,7))
df['InternetService'].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot('InternetService',data=df,ax=axes[1],order=['Fiber optic','DSL','No'])
axes[0].set_title('InternetService Variable Pie Chart')
axes[1].set_title('InternetService Variable Bar Graph')
plt.show()
OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
col=['OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']
for i in col:
f,axes=plt.subplots(1,2,figsize=(17,7))
df[i].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot(i,data=df,ax=axes[1],order=['No','Yes','No internet service'])
axes[0].set_title('{} Variable Pie Chart'.format(i))
axes[1].set_title('{} Variable Bar Graph'.format(i))
plt.show()
Contract
f,axes=plt.subplots(1,2,figsize=(17,7))
df['Contract'].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot('Contract',data=df,ax=axes[1],order=['Month-to-month','Two year','One year'])
axes[0].set_title('Contract Variable Pie Chart')
axes[1].set_title('Contract Variable Bar Graph')
plt.show()
Paperless Billing
f,axes=plt.subplots(1,2,figsize=(17,7))
df['PaperlessBilling'].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot('PaperlessBilling',data=df,ax=axes[1],order=['Yes','No'])
axes[0].set_title('PaperlessBilling Variable Pie Chart')
axes[1].set_title('PaperlessBilling Variable Bar Graph')
plt.show()
Payment Method
f,axes=plt.subplots(1,2,figsize=(17,7))
df['PaymentMethod'].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot('PaymentMethod',data=df,ax=axes[1],order=['Electronic check','Mailed check','Bank transfer (automatic)','Credit card (automatic)'])
axes[0].set_title('PaymentMethod Variable Pie Chart')
axes[1].set_title('PaymentMethod Variable Bar Graph')
plt.show()
Distribution of target variable
Churn
f,axes=plt.subplots(1,2,figsize=(17,7))
df['Churn'].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot('Churn',data=df,ax=axes[1],order=['No','Yes'])
axes[0].set_title('Churn Variable Pie Chart')
axes[1].set_title('Churn Variable Bar Graph')
plt.show()
Numerical vs Numerical
Tenure vs MonthlyCharges
figure = plt.figure(figsize=(50,25))
ax = sns.regplot(x='MonthlyCharges',y='tenure', data=df) # regression plot - scatter plot with a regression line
Tenure vs TotalCharges
figure = plt.figure(figsize=(50,25))
ax = sns.regplot(x='TotalCharges',y='tenure', data=df) # regression plot - scatter plot with a regression line
Correlation among pairs of continuous variables
plt.figure(figsize=(10,5))
sns.heatmap(df.corr(), annot=True, linewidths=.5, fmt= '.1f', center = 1 ) # heatmap
plt.show()
Categorical vs numerical
Monthly charges vs churn
plt.figure(figsize=(15,7))
sns.boxplot(x='Churn', y='MonthlyCharges', data= df)
plt.show()
df.groupby(by=['Churn'])['MonthlyCharges'].mean().reset_index().sort_values(['Churn']).tail(10).plot(x='Churn',
y='MonthlyCharges',
kind='bar',
figsize=(15,5))
plt.show()
Total charges vs churn
plt.figure(figsize=(15,7))
sns.boxplot(x='Churn', y='TotalCharges', data= df)
plt.show()
df.groupby(by=['Churn'])['TotalCharges'].mean().reset_index().sort_values(['Churn']).tail(10).plot(x='Churn',
y='TotalCharges',
kind='bar',
figsize=(15,5))
plt.show()
Tenure vs Churn
plt.figure(figsize=(15,7))
sns.boxplot(x='Churn', y='tenure', data= df)
plt.show()
df.groupby(by=['Churn'])['tenure'].mean().reset_index().sort_values(['Churn']).tail(10).plot(x='Churn',
y='tenure',
kind='bar',
figsize=(15,5))
plt.show()
Category vs category
data=df.copy() #taking copy of original dataframe
# Reassign target
data.Churn.replace(to_replace = dict(Yes = 1, No = 0), inplace = True)
def barplot(var_select, x_no_numeric) :
tmp1 = data[(data['Churn'] != 0)]
tmp2 = data[(data['Churn'] == 0)]
tmp3 = pd.DataFrame(pd.crosstab(data[var_select],data['Churn']), )
tmp3['Attr%'] = tmp3[1] / (tmp3[1] + tmp3[0]) * 100
if x_no_numeric == True :
tmp3 = tmp3.sort_values(1, ascending = False)
trace1 = go.Bar(
x=tmp1[var_select].value_counts().keys().tolist(),
y=tmp1[var_select].value_counts().values.tolist(),
text=tmp1[var_select].value_counts().values.tolist(),
textposition = 'auto',
name='Churn : yes',opacity = 0.8, marker=dict(
color='gold',
line=dict(color='#000000',width=1)))
trace2 = go.Bar(
x=tmp2[var_select].value_counts().keys().tolist(),
y=tmp2[var_select].value_counts().values.tolist(),
text=tmp2[var_select].value_counts().values.tolist(),
textposition = 'auto',
name='Churn : no', opacity = 0.8, marker=dict(
color='lightblue',
line=dict(color='#000000',width=1)))
trace3 = go.Scatter(
x=tmp3.index,
y=tmp3['Attr%'],
yaxis = 'y2',
name='% Churn', opacity = 0.6, marker=dict(
color='black',
line=dict(color='#000000',width=0.5
)))
layout = dict(title = str(var_select), autosize = False,
height = 500,
width = 800,
xaxis=dict(),
yaxis=dict(title= 'Count'),
yaxis2=dict(range= [-0, 75],
overlaying= 'y',
anchor= 'x',
side= 'right',
zeroline=False,
showgrid= False,
title= '% Churn'
))
fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
plotly.offline.iplot(fig)
Gender vs Churn
barplot('gender', True)
Senior Citizen vs churn
barplot('SeniorCitizen', True)
Partner vs Churn
barplot('Partner', True)
Dependents vs churn
barplot('Dependents', True)
Phone service vs churn
barplot('PhoneService', True)
multiple lines vs churn
barplot('MultipleLines', True)
InternetService vs churn
barplot('InternetService', True)
OnlineSecurity vs churn
barplot('OnlineSecurity', True)
OnlineBackup vs churn
barplot('OnlineBackup', True)
DeviceProtection vs churn
barplot('DeviceProtection', True)
TechSupport vs churn
barplot('TechSupport', True)
StreamingTV vs churn
barplot('StreamingTV', True)
StreamingMovies vs churn
barplot('StreamingMovies', True)
Contract vs churn
barplot('Contract', True)
PaperlessBilling vs churn
barplot('PaperlessBilling', True)
PaymentMethod vs churn
barplot('PaymentMethod', True)
MonthlyCharges vs Churn vs Payment Method
plt.figure(figsize=(15,5))
sns.pointplot(x="PaymentMethod", y="MonthlyCharges", hue = 'Churn', data=df)
plt.show()
MonthlyCharges vs Churn vs Contract
plt.figure(figsize=(15,5))
sns.pointplot(x="Contract", y="MonthlyCharges", hue = 'Churn', data=df)
plt.show()
MonthlyCharges vs Churn vs InternetService
plt.figure(figsize=(15,5))
sns.pointplot(x="InternetService", y="MonthlyCharges", hue = 'Churn', data=df)
plt.show()
We can statistically check whether all the independent variables has significant effect on target variables
class_summary=df.groupby('Churn') #getting mean values of each class for all independent variables
class_summary.mean().reset_index()
Statistical Testing of continuous feature with target variable
col=list(df.select_dtypes(include=['int64','float64','float32']).columns)
Here we will be using two-sample unpaired t-test
Ho(Null Hypothesis):There is no significant difference in independent feature with different category of Target variable
H1(Alternate Hypothesis):There is significant difference in independent feature with different category of Target variable
for i in col:
x = np.array(df[df.Churn == 'No'][i])
y = np.array(df[df.Churn == 'Yes'][i])
t, p_value = stats.ttest_ind(x,y, axis = 0,equal_var=False)
print('{} P_Value:{}'.format('\033[1m',p_value))
if p_value < 0.05: # Setting our significance level at 5%
print('{} Rejecting Null Hypothesis.{} of Churned and non-churned customers are not same'.format('\033[1m',i))
else:
print('{} Fail to Reject Null Hypothesis.{} of Churned and non-churned customers are same'.format('\033[1m',i))
print('\n')
Statistical Testing of categorical features with target variable
Ho: There is no significant difference in gender for different category of target variable
H1: There is significant difference in gender for different category of target variable
crosstab=pd.crosstab(df['Churn'],df['gender'])
print(crosstab)
Do these data provide sufficient evidence at the 5% significance level to infer that there are differences in gender among Churn (Yes and No)?
chi,p_value,dof,expected=stats.chi2_contingency(crosstab)
print('P_Value:', p_value)
if p_value < 0.05: # Setting our significance level at 5%
print('{} Rejecting Null Hypothesis. \n There is significant difference in gender for different category of target variable(Churn)'.format('\033[1m'))
else:
print('{} Fail to Reject Null Hypothesis.\n There is no significant difference in gender for different category of target variable(Churn)'.format('\033[1m'))
Similarly we will check for other categorical columns
cat_col=list(df.select_dtypes(include=['category']).columns)
cat_col.remove('gender')
cat_col.remove('Churn')
for i in cat_col:
crosstab=pd.crosstab(df['Churn'],df[i])
chi,p_value,dof,expected=stats.chi2_contingency(crosstab)
print(i+':')
print('\n')
if p_value < 0.05: # Setting our significance level at 5%
print('{} Rejecting Null Hypothesis. \n There is significant difference in {} Feature for different category of target variable(Churn)'.format('\033[1m',i))
else:
print('{} Fail to Reject Null Hypothesis.\n There is no significant difference in {} Feature for different category of target variable(Churn)'.format('\033[1m',i))
print('\n')
Separating categorical and numerical columns
#Target columns
target_col = ["Churn"]
#categorical columns
cat_cols = df.nunique()[df.nunique() < 10].keys().tolist()
cat_cols = [x for x in cat_cols if x not in target_col]
#numerical columns
num_cols = [x for x in df.columns if x not in cat_cols + target_col]
#Binary columns with 2 values
bin_cols = df.nunique()[df.nunique() == 2].keys().tolist()
#Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]
Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
df[i] = le.fit_transform(df[i])
one-hot encoding for multi-value columns
df = pd.get_dummies(data = df,columns = multi_cols,drop_first=True )
Scaling Numerical columns
std = StandardScaler()
scaled = std.fit_transform(df[num_cols])
scaled = pd.DataFrame(scaled,columns=num_cols)
#dropping original values merging scaled values for numerical columns
df_data_og = df.copy()
df = df.drop(columns = num_cols,axis = 1)
df = df.merge(scaled,left_index=True,right_index=True,how = "left")
#df_copy=df.copy()
#for i in df.columns:
#if i not in ('tenure','MonthlyCharges','TotalCharges'):
#df[i]=df[i].astype('category') #changing datatype to category.
df.dtypes
We will drop multi-collinear columns.
data=df.drop('Churn',axis=1)
#Threshold for removing correlated variables
threshold = 0.9
# Absolute value correlation matrix
corr_matrix = data.corr().abs()
corr_matrix.head()
# Upper triangle of correlations
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper.head()
# Select columns with correlations above threshold
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
print('There are %d columns to remove :' % (len(to_drop)))
#data = data.drop(columns = to_drop)
print(to_drop)
We have 7 columns to drop
df.drop(columns=to_drop,axis=1,inplace=True)
plt.figure(figsize=(17,7))
df['Churn'].value_counts().plot.pie(autopct='%1.1f%%')
plt.show()
There is imbalance in target variable.
If the imbalanced data is not treated beforehand, then this will degrade the performance of the classifier model. Most of the predictions will correspond to the majority class and treat the minority class features as noise in the data and ignore them. This will result in a high bias in the model.
A widely adopted technique for dealing with highly unbalanced datasets is called resampling
Two widely used resampling methods:
Undersampling: It is the process where you randomly delete some of the observations from the majority class in order to match the numbers with the minority class.
Oversampling: It is the process of generating synthetic data that tries to randomly generate a sample of the attributes from observations in the minority class
# Arrange data into independent variables and dependent variables
X=df.drop(columns='Churn')
y=df['Churn'] #target
Building model without sampling
# Split X and y into training and test set in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
model_entropy=DecisionTreeClassifier(random_state=1,criterion='entropy')
model_entropy.fit(X_train, y_train)
y_pred=model_entropy.predict(X_test)
print('Performance on train data: ',model_entropy.score(X_train, y_train)) # performance on train data
print('Performance on test data: ',model_entropy.score(X_test, y_test)) # performance on test data
Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
df_cm = pd.DataFrame(cm, index = [i for i in ["Non-Churn","Churn"]],
columns = [i for i in ["Non-Churn","Churn"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True ,fmt='g')
plt.show()
Classification Report
print("classification Matrix:\n",classification_report(y_test,y_pred))
We apply SMOTE only to the training data, so no synthetic samples leak into the test set
smote=SMOTE(random_state=1)
x_train_res, y_train_res = smote.fit_resample(X_train, y_train)
model_entropy=DecisionTreeClassifier(random_state=1,criterion='entropy')
model_entropy.fit(x_train_res, y_train_res)
y_pred=model_entropy.predict(X_test)
print('Performance on train data: ',model_entropy.score(x_train_res, y_train_res)) # performance on train data
print('Performance on test data: ',model_entropy.score(X_test, y_test)) # performance on test data
print("classification Matrix:\n",classification_report(y_test,y_pred))
We will Regularize/prune the decision tree by limiting the max. depth of trees and print the accuracy.
clf_pruned = DecisionTreeClassifier(criterion = "entropy", random_state = 100,
max_depth=3, min_samples_leaf=5)
clf_pruned.fit(x_train_res, y_train_res)
y_pred=clf_pruned.predict(X_test)
print('Performance on train data: ',clf_pruned.score(x_train_res, y_train_res)) # performance on train data
print('Performance on test data: ',clf_pruned.score(X_test, y_test)) # performance on test data
print("classification Matrix:\n",classification_report(y_test,y_pred))
Grid Search CV
parameter={'max_depth':np.arange(1,5),'criterion':['entropy','gini'],
'max_features':np.arange(1,10),'min_samples_leaf':np.arange(1,5)
#,'max_leaf_nodes':np.arange(2,20)
}
GS=GridSearchCV(DecisionTreeClassifier(random_state=1),parameter,cv=5)
GS.fit(x_train_res, y_train_res)
GS.best_params_
We got the best parameters by using grid search cv
clf_pruned = DecisionTreeClassifier(criterion = "entropy", random_state=1,
max_depth=4, max_features=7,min_samples_leaf=1)
clf_pruned.fit(x_train_res, y_train_res)
y_pred=clf_pruned.predict(X_test)
print('Performance on train data: ',clf_pruned.score(x_train_res, y_train_res)) # performance on train data
print('Performance on test data: ',clf_pruned.score(X_test, y_test)) # performance on test data
print("classification Matrix:\n",classification_report(y_test,y_pred))
As our point of interest is predicting class 1. This parameter selection gives 82% accuracy on predicting class 1. This is more generalized model.
print (pd.DataFrame(clf_pruned.feature_importances_, columns = ["Imp"], index = x_train_res.columns).sort_values(by=['Imp'],ascending=False))
Dropping column which is not contributing to model.
l=[]
a=pd.DataFrame(clf_pruned.feature_importances_, columns = ["Imp"], index = x_train_res.columns).sort_values(by=['Imp'],ascending=False)
for i in a[a['Imp']<=0].index:
l.append(i)
X_1=X.drop(columns=l,axis=1)
Building Model with selected parameters
smote=SMOTE(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X_1, y, test_size=0.30, random_state=1)
x_train_res, y_train_res = smote.fit_resample(X_train, y_train)
clf_pruned = DecisionTreeClassifier(criterion = "entropy", random_state=1,
max_depth=4, max_features=7,min_samples_leaf=1)
clf_pruned.fit(x_train_res, y_train_res)
y_pred=clf_pruned.predict(X_test)
print('Performance on train data: ',clf_pruned.score(x_train_res, y_train_res)) # performance on train data
print('Performance on test data: ',clf_pruned.score(X_test, y_test)) # performance on test data
print("classification Matrix:\n",classification_report(y_test,y_pred))
After training model with selected features we can see equal training and testing performance. This will reduce complexity in the model
We will build bagging on top of the decision tree which we built above
Defining grid search paramter
params = {'n_estimators': np.arange(45,55)}
bgcl = BaggingClassifier(base_estimator=clf_pruned,random_state=1)
bc_grid = GridSearchCV(estimator=bgcl, param_grid=params, cv=5)
bc_grid.fit(x_train_res, y_train_res)
bc_grid.best_params_
bgcl = BaggingClassifier(base_estimator=clf_pruned,n_estimators=45,random_state=1)
bgcl.fit(x_train_res, y_train_res)
y_pred=bgcl.predict(X_test)
print('Performance on train data: ',bgcl.score(x_train_res, y_train_res)) # performance on train data
print('Performance on test data: ',bgcl.score(X_test, y_test)) # performance on test data
print("classification Matrix:\n",classification_report(y_test,y_pred))
Performance of bagging is similar to Decision Tree.
parameter={'max_depth':np.arange(1,5),'criterion':['entropy','gini'],
'max_features':np.arange(1,10),'min_samples_leaf':np.arange(1,5)
,'n_estimators': np.arange(45,55)
}
RFG=GridSearchCV(RandomForestClassifier(random_state=1),parameter,cv=5)
RFG.fit(x_train_res, y_train_res)
RFG.best_params_
These are the best parameters for random forest.
rfcl = RandomForestClassifier(criterion='gini',n_estimators = 54,max_depth=4, min_samples_leaf=3 ,random_state=1,max_features=2)
rfcl = rfcl.fit(x_train_res, y_train_res)
y_pred=rfcl.predict(X_test)
print('Performance on train data: ',rfcl.score(x_train_res, y_train_res)) # performance on train data
print('Performance on test data: ',rfcl.score(X_test, y_test)) # performance on test data
print("classification Matrix:\n",classification_report(y_test,y_pred))
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
df_cm = pd.DataFrame(cm, index = [i for i in ["Non-Churn","Churn"]],
columns = [i for i in ["Non-Churn","Churn"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True ,fmt='g')
plt.show()
Doing Ada boost on the best decision tree which we built above.
params = {'n_estimators': np.arange(45,55),'learning_rate':[.001,0.01,.1]}
#abcl = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=1),random_state=1)
abcl = AdaBoostClassifier(base_estimator=clf_pruned,random_state=1)
abcl_grid = GridSearchCV(estimator=abcl, param_grid=params, cv=5)
abcl_grid.fit(x_train_res, y_train_res)
abcl_grid.best_params_
abcl = AdaBoostClassifier(base_estimator=clf_pruned,n_estimators=48,learning_rate=0.1,random_state=1)
#abcl = AdaBoostClassifier( n_estimators=50,random_state=1)
abcl = abcl.fit(x_train_res, y_train_res)
y_pred=abcl.predict(X_test)
print('Performance on train data: ',abcl.score(x_train_res, y_train_res)) # performance on train data
print('Performance on test data: ',abcl.score(X_test, y_test)) # performance on test data
print("classification Matrix:\n",classification_report(y_test,y_pred))
params = {'n_estimators': np.arange(45,55), 'max_features':np.arange(1,10),
'max_depth':np.arange(1,5),'learning_rate':[.001,0.01,.1],'loss': ['deviance', 'exponential']}
gbcl = GradientBoostingClassifier(random_state=1)
gbcl_grid = GridSearchCV(estimator=gbcl, param_grid=params, cv=5)
gbcl_grid.fit(x_train_res, y_train_res)
gbcl_grid.best_params_
gbcl = GradientBoostingClassifier(learning_rate=0.1,loss='deviance',max_depth=4,max_features=8
,n_estimators = 50,random_state=1)
gbcl = gbcl.fit(x_train_res, y_train_res)
y_pred=gbcl.predict(X_test)
print('Performance on train data: ',gbcl.score(x_train_res, y_train_res)) # performance on train data
print('Performance on test data: ',gbcl.score(X_test, y_test)) # performance on test data
print("classification Matrix:\n",classification_report(y_test,y_pred))
KNN = KNeighborsClassifier(n_neighbors= 5 , metric = 'euclidean' ) #Building knn with 5 neighbors
KNN.fit(x_train_res, y_train_res)
y_pred = KNN.predict(X_test)
print('Accuracy on Training data:',KNN.score(x_train_res, y_train_res) )
print('Accuracy on Test data:',KNN.score(X_test, y_test) )
print("classification Matrix:\n",classification_report(y_test,y_pred))
We are getting good training score but very less testing score
Recall value is nearly equal to both classes.
Checking with different K value
k=[1,3,5,7,9,11,13,15,17,19]
for i in k:
KNN = KNeighborsClassifier(n_neighbors=i, metric = 'euclidean' ) #Building knn with 5 neighbors
KNN.fit(x_train_res, y_train_res)
predicted_labels = KNN.predict(X_test)
print('Accuracy on Training data for k {} is {}:'.format(i,KNN.score(x_train_res, y_train_res)))
print('Accuracy on Test data for k {} is {}:'.format(i,KNN.score(X_test, y_test)))
print("classification Matrix:\n",classification_report(y_test,predicted_labels))
In all k-values we are getting very good training score and less testing score.
def score_model(model,params,cv):
smote=SMOTE(random_state=1)
train_score=[]
test_score=[]
for train_fold_index,test_fold_index in cv.split(X_1,y):
X_train,X_test=X.iloc[train_fold_index],X.iloc[test_fold_index]
y_train,y_test=y.iloc[train_fold_index],y.iloc[test_fold_index]
X_train_res,y_train_res=smote.fit_resample(X_train,y_train)
dtree=model(**params).fit(X_train_res,y_train_res)
y_pred=dtree.predict(X_test)
train_score.append(dtree.score(X_train_res,y_train_res))
test_score.append(dtree.score(X_test,y_test))
print('Train_score:',np.mean(np.array(train_score)))
print('Test_score:',np.mean(np.array(test_score)))
Here we will use these parameters and build model using k-fold to get better model.
Decision Tree Score
cv=StratifiedKFold(n_splits=10,random_state=1)
params={'criterion': 'entropy', 'max_depth': 4, 'max_features': 7, 'min_samples_leaf': 1,'random_state':1}
score_model(DecisionTreeClassifier,params,cv)
Bagging
cv=StratifiedKFold(n_splits=10,random_state=1)
params={'base_estimator':clf_pruned,'n_estimators':45,'random_state':1}
score_model(BaggingClassifier,params,cv)
Random Forest
cv=StratifiedKFold(n_splits=10,random_state=1)
#RandomForestClassifier(criterion='gini',n_estimators = 54,max_depth=4, min_samples_leaf=3 ,random_state=1,max_features=2)
params={'criterion':'gini','n_estimators':54,'max_depth':4,'min_samples_leaf':3,'random_state':1,'max_features':2}
score_model(RandomForestClassifier,params,cv)
Ada Boost
cv=StratifiedKFold(n_splits=10,random_state=1)
params={'base_estimator':clf_pruned,'n_estimators':48,'learning_rate':0.1,'random_state':1}
#AdaBoostClassifier(base_estimator=clf_pruned,n_estimators=48,learning_rate=0.1,random_state=1)
score_model(AdaBoostClassifier,params,cv)
Gradient Boosting
cv=StratifiedKFold(n_splits=10,random_state=1)
params={'loss':'deviance','n_estimators':50,'learning_rate':0.1,'max_depth':4,'max_features':8,'random_state':1}
#GradientBoostingClassifier(learning_rate=0.1,loss='deviance',max_depth=4,max_features=8
# ,n_estimators = 50,random_state=1)
score_model(GradientBoostingClassifier,params,cv)
I will choose Ada Boost as a final model because it has good testing score. Both training and test scores are balanced. Recall value of predicting class 1 is good compared to others.
Model Pickle
# Save to file in the current working directory
pkl_filename = "pickle_adamodel.pkl"
with open(pkl_filename, 'wb') as file:
pickle.dump(abcl, file)
# Load from file
with open(pkl_filename, 'rb') as file:
pickle_model = pickle.load(file)
I have pickled the model so that I can use it on any machine.
win=Tk()
win.geometry("400x600")
win.title("Diabetes Predictions")
Gender=Label(win,text="Gender:").grid(row=0,column=0)
Gender_var=tk.StringVar()
entry_frame= Entry(win,width=30,textvariable=Gender_var).grid(row=0,column=1)
Senior_Citizen=Label(win,text="Senior Citizen:").grid(row=1,column=0)
Sen_var=tk.IntVar()
entry_frame= Entry(win,width=30,textvariable=Sen_var).grid(row=1,column=1)
Partner=Label(win,text="Partner:").grid(row=2,column=0)
Partner_var=tk.IntVar()
entry_frame= Entry(win,width=30,textvariable=Partner_var).grid(row=2,column=1)
Dep=Label(win,text="Dependents:").grid(row=3,column=0)
Dep_var=tk.StringVar()
entry_frame= Entry(win,width=30,textvariable=Dep_var).grid(row=3,column=1)
Ten=Label(win,text="Tenure:").grid(row=4,column=0)
Ten_var=tk.IntVar()
entry_frame= Entry(win,width=30,textvariable=Ten_var).grid(row=4,column=1)
Phone=Label(win,text="Phone Services:").grid(row=5,column=0)
Phone_var=tk.StringVar()
entry_frame= Entry(win,width=30,textvariable=Phone_var).grid(row=5,column=1)
Mul=Label(win,text="Multiple Lines:").grid(row=6,column=0)
Mul_var=tk.StringVar()
entry_frame= Entry(win,width=30,textvariable=Mul_var).grid(row=6,column=1)
Int=Label(win,text="Internet Service:").grid(row=7,column=0)
Int_var=tk.StringVar()
entry_frame= Entry(win,width=30,textvariable=Int_var).grid(row=7,column=1)
Os=Label(win,text="Online Security:").grid(row=8,column=0)
Os_var=tk.StringVar()
entry_frame= Entry(win,width=30,textvariable=Os_var).grid(row=8,column=1)
Ob=Label(win,text="Online Backup:").grid(row=9,column=0)
Ob_var=tk.StringVar()
entry_frame= Entry(win,width=30,textvariable=Ob_var).grid(row=9,column=1)
Dev=Label(win,text="Device Protection:").grid(row=10,column=0)
Dev_var=tk.StringVar()
entry_frame= Entry(win,width=30,textvariable=Dev_var).grid(row=10,column=1)
Ts=Label(win,text="Tech Support:").grid(row=11,column=0)
Ts_var=tk.StringVar()
entry_frame= Entry(win,width=30,textvariable=Ts_var).grid(row=11,column=1)
St=Label(win,text="Streaming Tv:").grid(row=12,column=0)
St_var=tk.StringVar()
entry_frame= Entry(win,width=30,textvariable=St_var).grid(row=12,column=1)
Sm=Label(win,text="Streaming Movies:").grid(row=13,column=0)
Sm_var=tk.StringVar()
entry_frame= Entry(win,width=30,textvariable=Sm_var).grid(row=13,column=1)
Cont=Label(win,text="Contract:").grid(row=14,column=0)
Cont_var=tk.StringVar()
entry_frame= Entry(win,width=30,textvariable=Cont_var).grid(row=14,column=1)
Pb=Label(win,text="Paperless Billing:").grid(row=15,column=0)
Pb_var=tk.StringVar()
entry_frame= Entry(win,width=30,textvariable=Pb_var).grid(row=15,column=1)
Pm=Label(win,text="Payment Method:").grid(row=16,column=0)
Pm_var=tk.StringVar()
entry_frame= Entry(win,width=30,textvariable=Pm_var).grid(row=16,column=1)
Monthly_charges=Label(win,text="Monthly Charges:").grid(row=17,column=0)
Mon_var=tk.IntVar()
entry_frame= Entry(win,width=30,textvariable=Mon_var).grid(row=17,column=1)
Total_charges=Label(win,text="Total Charges:").grid(row=18,column=0)
Tot_var=tk.IntVar()
entry_frame= Entry(win,width=30,textvariable=Tot_var).grid(row=18,column=1)
def output():
    """Predict churn for the prepared module-level DS row and display it.

    Creates a read-out Entry at grid row 21 and writes 'Yes' when the pickled
    model predicts class 1, 'No' when it predicts class 0.
    """
    prediction = pickle_model.predict(DS)
    result_box = ttk.Entry(win, width=16)
    result_box.grid(row=21, column=1)
    # Keep the explicit two-branch check: anything other than 0/1 (which the
    # binary classifier never emits) leaves the box empty, as before.
    if prediction == 1:
        result_box.insert(1, 'Yes')
    elif prediction == 0:
        result_box.insert(1, 'No')
def transform(DS):
    """Encode, prune, scale, and reorder the single-row input frame in place.

    Mutates ``DS`` so the module-level frame later passed to the model by
    ``output()`` actually carries the transformed features, and also returns
    ``DS`` for convenience. Relies on the module-level training frame ``df``
    and its numeric column list ``num_cols`` (presumably
    ['tenure', 'MonthlyCharges', 'TotalCharges'] — defined earlier in the file).

    Parameters
    ----------
    DS : pd.DataFrame
        One-row frame with the raw form values, built by ``action()``.
    """
    # One-hot encode the categorical columns the model was trained on
    # (the omitted levels act as the drop-first baselines).
    DS['InternetService_Fiber optic'] = (DS['InternetService'] == 'Fiber optic') * 1
    DS['InternetService_No'] = (DS['InternetService'] == 'No') * 1
    DS['OnlineSecurity_Yes'] = (DS['OnlineSecurity'] == 'Yes') * 1
    DS['Contract_One year'] = (DS['Contract'] == 'One year') * 1
    DS['Contract_Two year'] = (DS['Contract'] == 'Two year') * 1
    # Drop the original categorical columns now that the dummies exist.
    DS.drop(columns=['InternetService', 'OnlineSecurity', 'Contract'], inplace=True)
    # Keep only the features the fitted model expects, in training (X_Test) order.
    keep = ['Partner', 'InternetService_Fiber optic', 'InternetService_No',
            'OnlineSecurity_Yes', 'Contract_One year', 'Contract_Two year',
            'tenure', 'MonthlyCharges', 'TotalCharges']
    # BUG FIX: the original dropped columns while iterating DS.columns
    # (modify-while-iterate can skip columns); collect the drops first.
    DS.drop(columns=[c for c in DS.columns if c not in keep], inplace=True)
    # BUG FIX: the original did std.fit_transform(df[num_cols]) and merged the
    # *training frame's* scaled rows into DS by index, so the user's own values
    # were never scaled. Fit on the training data, transform the user's row.
    std = StandardScaler()
    std.fit(df[num_cols])
    # astype(float): DS was filled cell-by-cell, so columns are object dtype.
    DS[num_cols] = std.transform(DS[num_cols].astype(float))
    # BUG FIX: the original rebound a *local* DS (drop/merge/reindex), so the
    # caller's frame never received the result. Reorder in place instead:
    # sklearn predicts positionally, so column order must match training.
    ordered = DS[keep].copy()
    DS.drop(columns=list(DS.columns), inplace=True)
    for c in keep:
        DS[c] = ordered[c]
    return DS
# Module-level placeholder; action() builds its own one-row frame on each
# Submit, so this empty frame only exists as a default before the first click.
DF = pd.DataFrame()
def action():
    """Submit handler: collect the form values into a one-row DataFrame.

    Reads every Tkinter *_var created above, assembles them into a single-row
    frame keyed by the training-data column names, rebinds the module-level
    ``DS`` to it, and runs ``transform`` so ``output()`` can predict on it.
    """
    global DS
    # Build the row in one shot; this replaces 19 repetitive
    # DF.loc[0, col] = var.get() assignments (and drops the unused DB frame).
    row = {
        'gender': Gender_var.get(),
        'SeniorCitizen': Sen_var.get(),
        'Partner': Partner_var.get(),
        'Dependents': Dep_var.get(),
        'tenure': Ten_var.get(),
        'PhoneService': Phone_var.get(),
        'MultipleLines': Mul_var.get(),
        'InternetService': Int_var.get(),
        'OnlineSecurity': Os_var.get(),
        'OnlineBackup': Ob_var.get(),
        'DeviceProtection': Dev_var.get(),
        'TechSupport': Ts_var.get(),
        'StreamingTV': St_var.get(),
        'StreamingMovies': Sm_var.get(),
        'Contract': Cont_var.get(),
        'PaperlessBilling': Pb_var.get(),
        'PaymentMethod': Pm_var.get(),
        'MonthlyCharges': Mon_var.get(),
        'TotalCharges': Tot_var.get(),
    }
    DS = pd.DataFrame([row])  # single row at index 0, like the original
    transform(DS)
# Submit collects the form into DS (action); Predict must be clicked after
# Submit, since output() reads the module-level DS that action() prepares.
submit_button=ttk.Button(win,text="Submit",command=action)
submit_button.grid(row=19,column=1)
predict_button=ttk.Button(win,text="Predict",command=output)
predict_button.grid(row=20,column=1)
# Hand control to Tk's event loop; blocks until the window is closed.
win.mainloop()